In [43]:
from IPython.display import display
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter("ignore")
%matplotlib inline 
data_16 = pd.read_csv(r'C:\New_partition\Thesis\Files\Files\shap\machine_16.csv',index_col=0)
data_0 = pd.read_csv(r'C:\New_partition\Thesis\Files\Files\shap\machine_0.csv',index_col=0)
data_9 = pd.read_csv(r'C:\New_partition\Thesis\Files\Files\shap\machine_9.csv',index_col=0)
In [44]:
import cufflinks as cf
import plotly
In [45]:
import plotly.io as pio
# Default renderer for plotly figures; individual cells below switch to 'svg'
# for static output.
pio.renderers.default='notebook'
In [46]:
import plotly.offline as py
import plotly.graph_objs as go
# Configure cufflinks and plotly for offline, inline rendering in the notebook.
cf.go_offline() # required to use plotly offline (no account required).
py.init_notebook_mode() # graphs charts inline (IPython).
In [47]:
import plotly.express as px
In [48]:
import numpy as np

Visualizing Data

I am visualizing data from multiple machines to see whether they share a common trend.

In [49]:
data_16.columns
Out[49]:
Index(['0', '1', '2', '3'], dtype='object')
In [50]:
# Give the generic '0'..'3' columns sensor names S1..S4.
data_16.columns = [f'S{n}' for n in range(1, 5)]
In [51]:
# Static SVG line plot of all four sensors for machine 16.
pio.renderers.default = 'svg'
fig_2 = px.line(data_16[['S1', 'S2', 'S3', 'S4']])
fig_2.show()
Jan 2019Jul 2019Jan 2020Jul 2020Jan 2021Jul 2021−300−200−1000100200300variableS1S2S3S4indexvalue
In [52]:
# Same view for machine 9, after applying the sensor names.
pio.renderers.default = 'svg'
data_9.columns = [f'S{n}' for n in range(1, 5)]
fig_3 = px.line(data_9[['S1', 'S2', 'S3', 'S4']])
fig_3.show()
Jan 2019Jul 2019Jan 2020Jul 2020Jan 2021Jul 2021−300−200−1000100200300variableS1S2S3S4indexvalue
In [53]:
# Same view for machine 0, after applying the sensor names.
pio.renderers.default = 'svg'
data_0.columns = [f'S{n}' for n in range(1, 5)]
fig_3 = px.line(data_0[['S1', 'S2', 'S3', 'S4']])
fig_3.show()
Jan 2019Jul 2019Jan 2020Jul 2020Jan 2021Jul 2021−400−300−200−1000100200300variableS1S2S3S4indexvalue

I have found that the data plotted above follows the same pattern across the machines. There are some clear outliers. By zooming in on the data points, I found that values greater than 260 or less than -260 fall in the outlier range. I have also observed that during the normal period the sensor values are greater than 0. During failure they are close to zero, and in case of a fault they peak very high.

In [14]:
#data=data[~( (data.iloc[:,1:2] > 260) | (data.iloc[:,1:2]<-260) ).all(axis=1) & ~( (data.iloc[:,0:1] > 260) | (data.iloc[:,0:1]<-260) ).all(axis=1) & ~( (data.iloc[:,2:3] > 260) | (data.iloc[:,2:3]<-260) ).all(axis=1) & ~( (data.iloc[:,3:4] > 260) | (data.iloc[:,3:4]<-260) ).all(axis=1) ]

Removing Outliers

In [54]:
# Mark outliers: any reading with magnitude above 260 becomes NaN.
data_16[data_16.abs() > 260] = np.nan
In [55]:
# Impute outliers: each NaN is replaced with the mean of every value above it
# in the same column (a running mean of all preceding rows — earlier imputed
# values feed into later means, matching the original cell-by-cell behaviour).
# Rewritten with scalar .iloc assignment: the original `.mean()[0]` relies on
# positional integer lookup on a string-indexed Series, which is deprecated
# in modern pandas.
for k in range(4):
    for i in np.where(data_16.iloc[:, k].isna())[0]:
        data_16.iloc[i, k] = data_16.iloc[:i, k].mean()
In [56]:
# Re-plot machine 16 after outlier imputation to confirm the spikes are gone.
pio.renderers.default = 'svg'
fig_4 = px.line(data_16[['S1', 'S2', 'S3', 'S4']])
fig_4.show()
Jan 2019Jul 2019Jan 2020Jul 2020Jan 2021Jul 2021−200−1000100200variableS1S2S3S4indexvalue

Taking Derivative

In [57]:
data_m=data_16.diff()
In [58]:
# Static SVG plot of the sensor derivatives.
fig_5 = px.line(data_m[['S1', 'S2', 'S3', 'S4']])
fig_5.show(renderer='svg')
Jan 2019Jul 2019Jan 2020Jul 2020Jan 2021Jul 2021−400−2000200400variableS1S2S3S4indexvalue
In [59]:
data_16['label']='0'
In [60]:
data_16.head(10)
Out[60]:
S1 S2 S3 S4 label
2019-01-01 00:00:00.000000000 12.623363 8.820046 -11.808870 10.073501 0
2019-01-01 08:00:09.603201067 10.826310 2.784958 11.559553 21.897843 0
2019-01-01 16:00:19.206402134 21.095022 -0.630819 -17.840214 -1.345860 0
2019-01-02 00:00:28.809603201 32.312872 6.540248 -13.499914 -4.257744 0
2019-01-02 08:00:38.412804268 28.057312 253.690365 21.970890 13.635885 0
2019-01-02 16:00:48.016005335 29.630700 -3.526247 3.352673 -2.994676 0
2019-01-03 00:00:57.619206402 38.888923 1.932672 -10.842909 -17.720486 0
2019-01-03 08:01:07.222407469 33.059533 3.207766 25.480815 1.122144 0
2019-01-03 16:01:16.825608536 25.195936 -5.472143 22.947462 -3.479043 0
2019-01-04 00:01:26.428809603 29.549700 -3.323675 -5.881249 -25.136276 0
In [61]:
#data_m.to_csv(r'C:\New_partition\To_delete\data_wo_out.csv')

Labelling data as 'normal', 'fault','failure'

Based on specific threshold values I am labelling the data points as fault, failure and normal. I have 3 specific criteria:

  1. all absolute differences are less than or equal to 1: failure
  2. all differences lie between -108 and 108: normal
  3. any difference is greater than or equal to 108 or less than or equal to -108: fault
In [62]:
# Label every sample from the derivative of machine 16's signals:
#   * all four |diffs| <= 1               -> 'failure' (signal is flat)
#   * all four diffs within +/-108        -> 'normal'
#   * any diff at or beyond +/-108        -> 'fault'   (abrupt jump)
# Rows whose diff is NaN (the first row) keep the '0' sentinel and are
# dropped in the next cell.
# Rewritten with .iloc: the original chained assignment `data_16.label[i]=...`
# triggers SettingWithCopyWarning and silently fails under pandas
# copy-on-write, and integer lookups like `data_m.S1[i]` on a datetime index
# rely on a deprecated positional fallback.
label_pos = data_16.columns.get_loc('label')
for i in range(len(data_m)):
    magnitudes = data_m.iloc[i, 0:4].abs()
    if (magnitudes <= 1).all():
        data_16.iloc[i, label_pos] = 'failure'
    elif (magnitudes <= 108).all():
        data_16.iloc[i, label_pos] = 'normal'
    elif (magnitudes >= 108).any():
        data_16.iloc[i, label_pos] = 'fault'
In [63]:
data_16=data_16[data_16['label']!='0']
In [64]:
data_16
Out[64]:
S1 S2 S3 S4 label
2019-01-01 08:00:09.603201067 10.826310 2.784958 11.559553 21.897843 normal
2019-01-01 16:00:19.206402134 21.095022 -0.630819 -17.840214 -1.345860 normal
2019-01-02 00:00:28.809603201 32.312872 6.540248 -13.499914 -4.257744 normal
2019-01-02 08:00:38.412804268 28.057312 253.690365 21.970890 13.635885 fault
2019-01-02 16:00:48.016005335 29.630700 -3.526247 3.352673 -2.994676 fault
... ... ... ... ... ...
2021-09-25 15:59:21.587195728 0.005583 -0.014235 0.022359 -0.004620 failure
2021-09-25 23:59:31.190396800 -0.000879 0.004775 0.006045 -0.009040 failure
2021-09-26 07:59:40.793597872 0.005554 -0.007923 -0.001051 0.000959 failure
2021-09-26 15:59:50.396798944 0.007716 0.028513 -0.001805 -0.002062 failure
2021-09-27 00:00:00.000000000 -0.002468 -0.006753 0.009061 0.008156 failure

2999 rows × 5 columns

In [65]:
import seaborn as sns
import matplotlib.pyplot as plt
In [66]:
# One scatter per sensor, coloured by the assigned label.
fig_6, fig_7, fig_8, fig_9 = (
    px.scatter(data_16, x=data_16.index, y=data_16[col], color="label")
    for col in ('S1', 'S2', 'S3', 'S4')
)
In [67]:
# Render all four labelled scatter plots as static SVGs.
pio.renderers.default = 'svg'
for fig in (fig_6, fig_7, fig_8, fig_9):
    fig.show()
Jan 2019Jul 2019Jan 2020Jul 2020Jan 2021Jul 2021−200−1000100200labelnormalfaultfailureindexS1
Jan 2019Jul 2019Jan 2020Jul 2020Jan 2021Jul 2021−200−1000100200labelnormalfaultfailureindexS2
Jan 2019Jul 2019Jan 2020Jul 2020Jan 2021Jul 2021−200−1000100200labelnormalfaultfailureindexS3
Jan 2019Jul 2019Jan 2020Jul 2020Jan 2021Jul 2021−200−1000100200labelnormalfaultfailureindexS4

The above graph shows the data points of each sensor. Color signifies which sensor data points are classified normal, fault and failure.

Discussion

  • First I have removed the outlier by carefully visualizing the data and setting the threshold that any value greater than or less than 260 is an outlier.
  • I have then removed the outliers in each column of the machine 16 data and replaced them with the running mean of the preceding non-outlier values
  • In the third step, I have taken the derivative of the signals for S1, S2, S3 and S4. I then applied conditions to label the data as normal, fault, and failure according to conditions
  • I can repeat the same steps for the data of other machines and pinpoint the time of fault in those machines as well.

Efficacy of my approach

My approach will pinpoint the time as soon as it detects that a sensor value lies in the fault region or the failure region. This would help ExampleCo.Inc take action quickly. This is shown in the figure above. You can see the data points which are classified as faulty, normal and failure.

Limitations of my approach

I have set a hard threshold based on visualization of data for detecting fault and failure. Setting a hard threshold is not effective sometimes because if the fault or failure value at some point in time lies a little below, my approach might miss the fault.

Other approaches

I could modify my existing approach by using a median filter or other denoising technique to smooth out the data because it is mentioned in the description "When a machine is operating in normal mode the data behaves in a fairly predictable way, but with a moderate amount of noise." It will be effective to smooth the signal but not too much as to change the value. Also I could utilize another technique to remove outliers such as using Z scores, interquartile range to create outlier fences.

Application to Data for Machine 0

I have applied the above mentioned steps to the data of Machine 0 and pinpointed the times of the fault, failure and normal modes.

In [68]:
# The machine-16 pipeline packaged as one reusable function, so it can be
# applied to any machine without copy-pasting the cells above.  The rewrite
# also replaces the chained assignments (`data_0.label[i]=...`) and integer
# lookups on a datetime index (`data_m.S1[i]`) with .iloc, which work under
# modern pandas.
def label_machine(df, outlier_limit=260, failure_tol=1, fault_tol=108):
    """Impute outliers, then label each sample 'normal', 'fault' or 'failure'.

    Parameters
    ----------
    df : DataFrame with sensor columns S1..S4 and a datetime index.
        Modified in place (mirroring the original cells) and also returned.
    outlier_limit : readings with magnitude above this are treated as
        outliers and replaced with the running mean of the preceding values
        in the same column.
    failure_tol : all four |derivatives| <= failure_tol -> 'failure'.
    fault_tol : all derivatives within +/-fault_tol -> 'normal';
        any derivative at or beyond +/-fault_tol -> 'fault'.

    Returns the labelled frame with unlabelled rows (NaN first diff) dropped.
    """
    # Outlier detection and imputation.
    df[df.abs() > outlier_limit] = np.nan
    for k in range(4):
        for i in np.where(df.iloc[:, k].isna())[0]:
            df.iloc[i, k] = df.iloc[:i, k].mean()
    # Derivative must be taken before the label column is added.
    diffs = df.diff()
    df['label'] = '0'
    label_pos = df.columns.get_loc('label')
    for i in range(len(diffs)):
        magnitudes = diffs.iloc[i, 0:4].abs()
        if (magnitudes <= failure_tol).all():
            df.iloc[i, label_pos] = 'failure'
        elif (magnitudes <= fault_tol).all():
            df.iloc[i, label_pos] = 'normal'
        elif (magnitudes >= fault_tol).any():
            df.iloc[i, label_pos] = 'fault'
    return df[df['label'] != '0']

data_0 = label_machine(data_0)
fig_6 = px.scatter(data_0, x=data_0.index, y=data_0.S1, color="label")
fig_7 = px.scatter(data_0, x=data_0.index, y=data_0.S2, color="label")
fig_8 = px.scatter(data_0, x=data_0.index, y=data_0.S3, color="label")
fig_9 = px.scatter(data_0, x=data_0.index, y=data_0.S4, color="label")
for fig in (fig_6, fig_7, fig_8, fig_9):
    fig.show()
Jan 2019Jul 2019Jan 2020Jul 2020Jan 2021Jul 2021−200−1000100200labelnormalfaultfailureindexS1
Jan 2019Jul 2019Jan 2020Jul 2020Jan 2021Jul 2021−200−1000100200labelnormalfaultfailureindexS2
Jan 2019Jul 2019Jan 2020Jul 2020Jan 2021Jul 2021−200−1000100200labelnormalfaultfailureindexS3
Jan 2019Jul 2019Jan 2020Jul 2020Jan 2021Jul 2021−200−1000100200labelnormalfaultfailureindexS4
In [ ]: